In [ ]:
# Dr. M. Baron, Statistical Machine Learning class, STAT-427/627
# TREES
# Import necessary libraries
! pip install pandas numpy scikit-learn matplotlib ISLP
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
import matplotlib.pyplot as plt
from ISLP import load_data
# Load the Auto dataset from package ISLP
Auto = load_data('Auto')
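In [ ]:
# A quick sanity check of the data before modeling (not part of the original analysis):
# the ISLP version of Auto has 392 rows with columns mpg, cylinders, displacement,
# horsepower, weight, acceleration, year, origin, and name.
print(Auto.shape)
print(Auto.columns.tolist())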
In [42]:
# 1. Classification trees
# Define a categorical variable ECO
ECO = np.where(Auto['mpg'] > Auto['mpg'].median(), "Economy", "Consuming")
Cars = Auto.assign(ECO=ECO) # Include ECO into the data set
# Check class distribution
print(Cars['ECO'].value_counts())
ECO
Consuming    196
Economy      196
Name: count, dtype: int64
In [44]:
# Build initial tree to classify ECO based on all predictors (excluding 'name' if it exists in the data)
X = Cars.drop(columns=['ECO', 'name'], errors='ignore')
y = Cars['ECO']
clf = DecisionTreeClassifier()
clf.fit(X, y)
# Display the tree structure
plot_tree(clf, feature_names=X.columns, class_names=clf.classes_, filled=True)
Out[44]:
[Text(0.5, 0.75, 'mpg <= 22.75\ngini = 0.5\nsamples = 392\nvalue = [196, 196]\nclass = Consuming'),
 Text(0.25, 0.25, 'gini = 0.0\nsamples = 196\nvalue = [196, 0]\nclass = Consuming'),
 Text(0.75, 0.25, 'gini = 0.0\nsamples = 196\nvalue = [0, 196]\nclass = Economy')]
In [48]:
# Of course, classifying ECO based on mpg is trivial: the tree picks this obvious split immediately.
# So we exclude mpg and predict ECO from the car's other technical characteristics instead.
X_refined = Cars[['horsepower', 'weight', 'acceleration']]
clf_refined = DecisionTreeClassifier()
clf_refined.fit(X_refined, y)
# Display refined tree
plot_tree(clf_refined, feature_names=X_refined.columns, class_names=clf_refined.classes_, filled=True)
plt.show()
In [52]:
# This is too small to read. We can increase the figure size:
plt.figure(figsize=(30, 20)) # Adjust width and height as needed
plot_tree(clf_refined, feature_names=X_refined.columns, class_names=clf_refined.classes_, filled=True)
plt.show()
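In [ ]:
# An optional check (not in the original notebook): feature_importances_ reports each
# predictor's share of the total impurity reduction, showing which variable drives the splits.
for name, importance in zip(X_refined.columns, clf_refined.feature_importances_):
    print(f"{name}: {importance:.3f}")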
In [54]:
# Model Summary
print("Number of terminal nodes:", clf_refined.get_n_leaves())
print("Tree depth:", clf_refined.get_depth())
Number of terminal nodes: 51
Tree depth: 11
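In [ ]:
# A plain-text view of the splits can be easier to read than a large plot.
# An optional alternative using sklearn's export_text (not in the original notebook);
# max_depth truncates the printout to the first three levels.
from sklearn.tree import export_text
print(export_text(clf_refined, feature_names=list(X_refined.columns), max_depth=3))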
In [56]:
# Estimate the classification rate on a held-out test set (50/50 train/test split)
train_data, test_data, train_labels, test_labels = train_test_split(X_refined, y, test_size=0.5, random_state=42)
clf_val = DecisionTreeClassifier()
clf_val.fit(train_data, train_labels)
# Predictions and confusion matrix
pred_labels = clf_val.predict(test_data)
conf_matrix = confusion_matrix(test_labels, pred_labels)
print("Confusion matrix:\n", conf_matrix)
print("Classification accuracy:", accuracy_score(test_labels, pred_labels))
Confusion matrix:
 [[75 25]
 [ 6 90]]
Classification accuracy: 0.8418367346938775
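In [ ]:
# The estimate above comes from a single random split; k-fold cross-validation averages
# over several splits and is more stable. A minimal sketch with the default 5 folds:
cv_scores = cross_val_score(DecisionTreeClassifier(random_state=42), X_refined, y, cv=5)
print("5-fold CV accuracy:", round(cv_scores.mean(), 3), "+/-", round(cv_scores.std(), 3))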
In [58]:
# Pruning the tree by cross-validation to find optimal size
path = clf_refined.cost_complexity_pruning_path(X_refined, y)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
# Compute the cross-validated classification rate for each alpha on the pruning path
scores = [cross_val_score(DecisionTreeClassifier(ccp_alpha=alpha), X_refined, y, cv=5).mean() for alpha in ccp_alphas]
plt.figure()
plt.plot(ccp_alphas, impurities, marker='o', label="Total impurity (deviance)")
plt.plot(ccp_alphas, scores, marker='x', label="Classification rate")
plt.xlabel("Effective alpha")
plt.ylabel("Total impurity of leaves / Classification rate")
plt.legend()
plt.show()
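In [ ]:
# The largest alpha on the pruning path collapses the tree to the root node, so its
# cross-validated accuracy is just the majority-class rate. A small optional refinement
# (not in the original notebook) is to exclude it before picking the best alpha:
candidate_alphas, candidate_scores = ccp_alphas[:-1], scores[:-1]
print("Best alpha among non-trivial trees:", candidate_alphas[np.argmax(candidate_scores)])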
In [78]:
# Prune the tree at the alpha with the highest cross-validated accuracy
# (equivalently, the smallest misclassification error)
optimal_alpha = ccp_alphas[np.argmax(scores)]
pruned_clf = DecisionTreeClassifier(ccp_alpha=optimal_alpha)
pruned_clf.fit(X_refined, y)
# Display pruned tree
plot_tree(pruned_clf, feature_names=X_refined.columns, class_names=pruned_clf.classes_, filled=True)
plt.show()
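In [ ]:
# Summary of the pruned tree, mirroring the summary printed for the unpruned tree above,
# plus its cross-validated accuracy (cross_val_score refits a clone on each fold):
print("Number of terminal nodes:", pruned_clf.get_n_leaves())
print("Tree depth:", pruned_clf.get_depth())
print("5-fold CV accuracy:", round(cross_val_score(pruned_clf, X_refined, y, cv=5).mean(), 3))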
In [72]:
# 2. Regression Trees
from sklearn.tree import DecisionTreeRegressor
# Define the target and features for regression tree
X_reg = (Cars.drop(columns=['mpg', 'name', 'origin', 'ECO'], errors='ignore')
             .assign(origin=Cars['origin'].astype('category').cat.codes))
y_reg = Cars['mpg']
reg_tree = DecisionTreeRegressor()
reg_tree.fit(X_reg, y_reg)
Out[72]:
DecisionTreeRegressor()
In [86]:
# Display regression tree
plt.figure(figsize=(50, 40)) # Set figure size
plot_tree(
reg_tree,
feature_names=X_reg.columns,
filled=True,
fontsize=10 # Adjust this as needed for readability
)
plt.show()
# Summary of the regression tree (residual mean deviance here is the training MSE)
print("Number of terminal nodes:", reg_tree.get_n_leaves())
print("Tree depth:", reg_tree.get_depth())
print("Residual mean deviance:", np.mean((y_reg - reg_tree.predict(X_reg))**2))
Number of terminal nodes: 321
Tree depth: 17
Residual mean deviance: 0.0
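In [ ]:
# A residual mean deviance of 0 means the fully grown tree memorizes the training data,
# so it says nothing about predictive accuracy. A minimal sketch of an honest estimate
# on a held-out set, with ccp_alpha=1.0 as an illustrative (untuned) pruning value:
Xr_train, Xr_test, yr_train, yr_test = train_test_split(X_reg, y_reg, test_size=0.5, random_state=42)
reg_pruned = DecisionTreeRegressor(ccp_alpha=1.0)
reg_pruned.fit(Xr_train, yr_train)
print("Held-out test MSE:", round(np.mean((yr_test - reg_pruned.predict(Xr_test))**2), 2))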
In [ ]: